#First pass at EITC Validation graphs
library(readr)
library(haven)
library(tidyverse)
library(magrittr)

#Define functions for later use
add_narm <- function(a,b,c,d,e,f){
  a = ifelse(is.na(a)==TRUE,0,a)
  b = ifelse(is.na(b)==TRUE,0,b)
  c = ifelse(is.na(c)==TRUE,0,c)
  d = ifelse(is.na(d)==TRUE,0,d)
  e = ifelse(is.na(e)==TRUE,0,e)
  f = ifelse(is.na(f)==TRUE,0,f)
  a+b+c+d+e+f
}

sum_narm <- function(x){
  sum(x,na.rm = TRUE)
}

mean_narm <- function(x){
  mean(x,na.rm = TRUE)
}
#Load the data
reg_data <- read.csv("Z:/Race_Prediction/Data/NTJ Short Paper/reg_data.csv")
load("X:/public/demographic_tax_data/LIHTC/Data/CI_boot_samples/sp_summaries/all.RData")
CI_data <- as.data.frame(diffs.eitc_unc.incbin.clust)
CI_data$age_bin <- as.integer(CI_data$agi.bin.filers)

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Bounding Exercise (from Elzayn, et al.)
# Differences by AGI category: Non-White - White
#Test EITC
eitc <- as.data.frame(subset(reg_data,reg_data$filer==1))
eitc$eic_ind <- ifelse(eitc$eic>0 & is.na(eitc$eic)==FALSE,1,0)

#Define AGI categories

cutoffs <- rep(0,10)
for(i in 1:10){
  cutoffs[i] <- quantile(eitc$agi,(0.1*i))
}

eitc %<>%
  mutate(agi_cat = ifelse(agi<cutoffs[1],1,NA),
         agi_cat = ifelse(agi>=cutoffs[1] & agi<cutoffs[2],2,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[2] & agi<cutoffs[3],3,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[3] & agi<cutoffs[4],4,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[4] & agi<cutoffs[5],5,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[5] & agi<cutoffs[6],6,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[6] & agi<cutoffs[7],7,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[7] & agi<cutoffs[8],8,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[8] & agi<cutoffs[9],9,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[9],10,agi_cat))

#Difference in conditional means between black and white

#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(eitc$eic,eitc$demographic==2 & eitc$agi_cat==i))-
    mean(subset(eitc$eic,eitc$demographic==1 & eitc$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b2,eitc$agi_cat==i)/
                             sum_narm(subset(eitc$prob_b2,eitc$agi_cat==i))) -
    sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b1,eitc$agi_cat==i)/
               sum_narm(subset(eitc$prob_b1,eitc$agi_cat==i)))
}


#Linear estimator
eitc2 <- as.data.frame(eitc)

predict_linear <- rep(0,10)
for(i in 1:10){
  eitc_sub = subset(eitc2,eitc2$agi_cat==i)
  regression = lm(eitc ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = eitc_sub)
  predict_linear[i] <- regression$coefficients[2]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)

#Graphing colors
library(RColorBrewer)
clr <- brewer.pal(8,"Set1")

#Graph EITC Take-up truth versus predicted
par(mfrow=c(2,2))
par(bg=NA)

# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-50,3000),col="white",
     main="Black - White",
     xlab = "Sample Income Decile",
     ylab="U.S. Dollars",
     yaxt="n",xaxt="n",
     cex.lab=1.1, cex.main=1.3)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=2, at=c(0,1000,2000,3000),
     labels=c("$0","$1,000","$2,000","$3,000"))
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")





#Difference in conditional means between hispanic and white

#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(eitc$eic,eitc$demographic==6 & eitc$agi_cat==i)) -
    mean(subset(eitc$eic,eitc$demographic==1 & eitc$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b6,eitc$agi_cat==i)/
                             sum_narm(subset(eitc$prob_b6,eitc$agi_cat==i))) -
    sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b1,eitc$agi_cat==i)/
               sum_narm(subset(eitc$prob_b1,eitc$agi_cat==i)))
}


#Linear estimator
eitc2 <- as.data.frame(eitc)

predict_linear <- rep(0,10)
for(i in 1:10){
  eitc_sub = subset(eitc2,eitc2$agi_cat==i)
  regression = lm(eitc ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = eitc_sub)
  predict_linear[i] <- regression$coefficients[6]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-50,2500),col="white",
     main="Hispanic - White",
     xlab = "Sample Income Decile",
     ylab="U.S. Dollars",
     yaxt="n",xaxt="n",
     cex.lab=1.1, cex.main=1.3)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=2, at=c(0,1000,2000),
     labels=c("$0","$1,000","$2,000"))
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")



#Difference in conditional means between ANHPI and white
#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(eitc$eic,eitc$demographic==3 & eitc$agi_cat==i)) -
    mean(subset(eitc$eic,eitc$demographic==1 & eitc$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b3,eitc$agi_cat==i)/
                             sum_narm(subset(eitc$prob_b3,eitc$agi_cat==i))) -
    sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b1,eitc$agi_cat==i)/
               sum_narm(subset(eitc$prob_b1,eitc$agi_cat==i)))
}


#Linear estimator
eitc2 <- as.data.frame(eitc)

predict_linear <- rep(0,10)
for(i in 1:10){
  eitc_sub = subset(eitc2,eitc2$agi_cat==i)
  regression = lm(eitc ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = eitc_sub)
  predict_linear[i] <- regression$coefficients[3]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)



# Draw the two bar chart using above datasets
plot(xvals,true_val,col="white", ylim=c(-750,1000),
     main="ANHPI - White",
     xlab = "Sample Income Decile",
     ylab="U.S. Dollars",
     yaxt="n",xaxt="n",
     cex.lab=1.1, cex.main=1.3)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=2, at=c(-500,0,500,1000,1500,2000),
     labels=c("-$500","$0","$500","$1,000","$1,500","$2,000"))
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")



#Difference in conditional means between Native and white
#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(eitc$eic,eitc$demographic==4 & eitc$agi_cat==i)) -
    mean(subset(eitc$eic,eitc$demographic==1 & eitc$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b4,eitc$agi_cat==i)/
                             sum_narm(subset(eitc$prob_b4,eitc$agi_cat==i))) -
    sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b1,eitc$agi_cat==i)/
               sum_narm(subset(eitc$prob_b1,eitc$agi_cat==i)))
}

#Linear estimator
eitc2 <- as.data.frame(eitc)

predict_linear <- rep(0,10)
for(i in 1:10){
  eitc_sub = subset(eitc2,eitc2$agi_cat==i)
  regression = lm(eitc ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = eitc_sub)
  predict_linear[i] <- regression$coefficients[4]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,col="white", ylim=c(-50,3000),
     main="Native - White",
     xlab = "Sample Income Decile",
     ylab="U.S. Dollars",
     yaxt="n",xaxt="n",
     cex.lab=1.1, cex.main=1.3)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=2, at=c(0,1000,2000,3000),
     labels=c("$0","$1,000","$2,000","$3,000"))
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Differences by AGI category: Non-White compared to Non-White
#Test EITC
eitc <- as.data.frame(subset(reg_data,reg_data$filer==1))

#Define AGI categories
cutoffs <- rep(0,10)
for(i in 1:10){
  cutoffs[i] <- quantile(eitc$agi,(0.1*i))
}

eitc %<>%
  mutate(agi_cat = ifelse(agi<cutoffs[1],1,NA),
         agi_cat = ifelse(agi>=cutoffs[1] & agi<cutoffs[2],2,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[2] & agi<cutoffs[3],3,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[3] & agi<cutoffs[4],4,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[4] & agi<cutoffs[5],5,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[5] & agi<cutoffs[6],6,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[6] & agi<cutoffs[7],7,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[7] & agi<cutoffs[8],8,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[8] & agi<cutoffs[9],9,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[9],10,agi_cat))


#++++++++++++ Graph other differences against each other
#Difference in conditional means between Black and Hispanic
#Probabilistic estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(eitc$eic,eitc$demographic==2 & eitc$agi_cat==i)) -
    mean(subset(eitc$eic,eitc$demographic==6 & eitc$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b2,eitc$agi_cat==i)/
                             sum_narm(subset(eitc$prob_b2,eitc$agi_cat==i))) -
    sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b6,eitc$agi_cat==i)/
               sum_narm(subset(eitc$prob_b6,eitc$agi_cat==i)))
}


#Linear estimator
eitc2 <- as.data.frame(eitc)

predict_linear <- rep(0,10)
for(i in 1:10){
  eitc_sub = subset(eitc2,eitc2$agi_cat==i)
  regression = lm(eitc ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = eitc_sub)
  predict_linear[i] <- regression$coefficients[2]-regression$coefficients[6]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)

#Graph EITC Take-up truth versus predicted
par(mfrow=c(3,2))
par(bg=NA)

# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-250,1500),col="white",
     main="Black - Hispanic",
     xlab = "Sample Income Decile",
     ylab="U.S. Dollars",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=2, at=c(-1000,-500,0,500,1000,1500,2000),
     labels=c("-$1,000","-$500","$0","$500","$1,000","$1,500","$2,000"))
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")





#Difference in conditional means between Black and Asian
#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(eitc$eic,eitc$demographic==2 & eitc$agi_cat==i)) -
    mean(subset(eitc$eic,eitc$demographic==3 & eitc$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b2,eitc$agi_cat==i)/
                             sum_narm(subset(eitc$prob_b2,eitc$agi_cat==i))) -
    sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b3,eitc$agi_cat==i)/
               sum_narm(subset(eitc$prob_b3,eitc$agi_cat==i)))
}

#Linear estimator
eitc2 <- as.data.frame(eitc)

predict_linear <- rep(0,10)
for(i in 1:10){
  eitc_sub = subset(eitc2,eitc2$agi_cat==i)
  regression = lm(eitc ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = eitc_sub)
  predict_linear[i] <- regression$coefficients[2]-regression$coefficients[3]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)

# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-250,2500),col="white",
     main="Black - ANHPI",
     xlab = "Sample Income Decile",
     ylab="U.S. Dollars",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=2, at=c(-1000,-500,0,500,1000,1500,2000,2500),
     labels=c("-$1,000","-$500","$0","$500","$1,000","$1,500","$2,000","$2,500"))
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")




#Difference in conditional means between Hispanic and Asian
#Probabilistic estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(eitc$eic,eitc$demographic==6 & eitc$agi_cat==i)) -
    mean(subset(eitc$eic,eitc$demographic==3 & eitc$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b6,eitc$agi_cat==i)/
                             sum_narm(subset(eitc$prob_b6,eitc$agi_cat==i))) -
    sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b3,eitc$agi_cat==i)/
               sum_narm(subset(eitc$prob_b3,eitc$agi_cat==i)))
}

#Linear estimator
eitc2 <- as.data.frame(eitc)

predict_linear <- rep(0,10)
for(i in 1:10){
  eitc_sub = subset(eitc2,eitc2$agi_cat==i)
  regression = lm(eitc ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = eitc_sub)
  predict_linear[i] <- regression$coefficients[6]-regression$coefficients[3]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-250,2000),col="white",
     main="Hispanic - ANHPI",
     xlab = "Sample Income Decile",
     ylab="U.S. Dollars",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=2, at=c(-1000,-500,0,500,1000,1500,2000),
     labels=c("-$1,000","-$500","$0","$500","$1,000","$1,500","$2,000"))
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")




#Difference in conditional means between Native and Black
#Probabilistic estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(eitc$eic,eitc$demographic==4 & eitc$agi_cat==i)) -
    mean(subset(eitc$eic,eitc$demographic==2 & eitc$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b4,eitc$agi_cat==i)/
                             sum_narm(subset(eitc$prob_b4,eitc$agi_cat==i))) -
    sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b2,eitc$agi_cat==i)/
               sum_narm(subset(eitc$prob_b2,eitc$agi_cat==i)))
}

#Linear estimator
eitc2 <- as.data.frame(eitc)

predict_linear <- rep(0,10)
for(i in 1:10){
  eitc_sub = subset(eitc2,eitc2$agi_cat==i)
  regression = lm(eitc ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = eitc_sub)
  predict_linear[i] <- regression$coefficients[4]-regression$coefficients[2]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)

# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-1000,1500),col="white",
     main="Native - Black",
     xlab = "Sample Income Decile",
     ylab="U.S. Dollars",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=2, at=c(-1000,-500,0,500,1000,1500,2000),
     labels=c("-$1,000","-$500","$0","$500","$1,000","$1,500","$2,000"))
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")


#Difference in conditional means between Native and Asian
#Probabilistic estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(eitc$eic,eitc$demographic==4 & eitc$agi_cat==i)) -
    mean(subset(eitc$eic,eitc$demographic==3 & eitc$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b4,eitc$agi_cat==i)/
                             sum_narm(subset(eitc$prob_b4,eitc$agi_cat==i))) -
    sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b3,eitc$agi_cat==i)/
               sum_narm(subset(eitc$prob_b3,eitc$agi_cat==i)))
}

#Linear estimator
eitc2 <- as.data.frame(eitc)

predict_linear <- rep(0,10)
for(i in 1:10){
  eitc_sub = subset(eitc2,eitc2$agi_cat==i)
  regression = lm(eitc ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = eitc_sub)
  predict_linear[i] <- regression$coefficients[4]-regression$coefficients[3]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above data sets
plot(xvals,true_val,ylim=c(-50,2500),col="white",
     main="Native - ANHPI",
     xlab = "Sample Income Decile",
     ylab="U.S. Dollars",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=2, at=c(-1000,-500,0,500,1000,1500,2000,2500),
     labels=c("-$1,000","-$500","$0","$500","$1,000","$1,500","$2,000","$2,500"))
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")

#Difference in conditional means between Native and Hispanic

#Probabilistic estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(eitc$eic,eitc$demographic==4 & eitc$agi_cat==i)) -
    mean(subset(eitc$eic,eitc$demographic==6 & eitc$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b4,eitc$agi_cat==i)/
                             sum_narm(subset(eitc$prob_b4,eitc$agi_cat==i))) -
    sum_narm(subset(eitc$eic,eitc$agi_cat==i)*subset(eitc$prob_b6,eitc$agi_cat==i)/
               sum_narm(subset(eitc$prob_b6,eitc$agi_cat==i)))
}

#Linear estimator
eitc2 <- as.data.frame(eitc)

predict_linear <- rep(0,10)
for(i in 1:10){
  eitc_sub = subset(eitc2,eitc2$agi_cat==i)
  regression = lm(eitc ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = eitc_sub)
  predict_linear[i] <- regression$coefficients[4]-regression$coefficients[6]
  
}  

#X-values for graphxvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-250,2000),col="white",
     main="Native - Hispanic",
     xlab = "Sample Income Decile",
     ylab="U.S. Dollars",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=2, at=c(-1000,-500,0,500,1000,1500,2000),
     labels=c("-$1,000","-$500","$0","$500","$1,000","$1,500","$2,000"))
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")

